import codecs
import jieba
import xlrd
import os
# Read a file
def read_lines(filename):
    """Read a UTF-8 text file and return its lines, whitespace-stripped.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line, in file order. Each line has its
        leading and trailing whitespace (including the newline) removed via
        ``str.strip()``; blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # The context manager guarantees the handle is closed even when reading
    # or decoding raises part-way through the file.
    with open(filename, 'r', encoding='utf-8') as fp:
        return [line.strip() for line in fp]

# List all documents under a folder
def doc_list(newsFoldersPath="news"):
    """Return the paths of all entries directly inside a folder.

    Args:
        newsFoldersPath: Directory to list. Defaults to ``"news"``, the
            location that was previously hard-coded inside the function.

    Returns:
        A list of paths, each being ``newsFoldersPath`` joined with one entry
        name returned by ``os.listdir``. Entries are not filtered by type and
        the order is whatever ``os.listdir`` yields.

    Raises:
        OSError: If the directory does not exist or cannot be read.
    """
    # Use the caller-supplied folder; it used to be overwritten with "news",
    # which made the argument meaningless.
    return [
        os.path.join(newsFoldersPath, entry)
        for entry in os.listdir(newsFoldersPath)
    ]

# Tokenize a sentence and return a list
def segmentation(sentence):
    """Split a sentence into words with jieba and return them as a list.

    Args:
        sentence: The text to segment.

    Returns:
        A list of the tokens yielded by ``jieba.cut(sentence)``, in order.
        No stop-word filtering is applied.
    """
    # The former ``sentence.encode('utf-8')`` call discarded its result and
    # had no effect on segmentation, so it is dropped; ``list`` materialises
    # the generator returned by ``jieba.cut``.
    return list(jieba.cut(sentence))